1 Introduction
This webbook contains all the code used for data analysis in study of the population-level metagenomic data of Podarcis muralis lizards across elevational gradients in various mountain ranges of the Pyrenees.
1.1 Prepare the R environment
1.1.1 Environment
To reproduce all the analyses locally, clone this repository in your computer using:
RStudio > New Project > Version Control > Git
And indicating the following git repository:
Once the R project has been created, follow the instructions and code chunks shown in this webbook.
1.1.2 Libraries
The following R packages are required for the data analysis.
# Base
library(R.utils)
library(knitr)
library(tidyverse)
library(devtools)
library(tinytable)
# For tree handling
library(ape)
library(phyloseq)
library(phytools)
# For plotting
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(ggnewscale)
library(gridExtra)
library(ggtreeExtra)
library(ggtree)
library(ggh4x)
# For statistics
library(spaa)
library(vegan)
library(Rtsne)
library(geiger)
library(hilldiv2)
library(distillR)
library(broom.mixed)
library(Hmsc)
library(corrplot)2 Prepare data
2.1 Load data
Load the original data files outputted by the bioinformatic pipeline.
2.2 Create working objects
Transform the original data files into working objects for downstream analyses.
2.3 Prepare color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)3 Data summary
Summary of sampled individuals and analysed faecal samples.
[1] 105
#number of samples by transect
sample_metadata %>%
group_by(Transect) %>%
summarise(n_samples = length(EHI_number)) %>%
tt()| Transect | n_samples |
|---|---|
| Aisa | 22 |
| Aran | 37 |
| Sentein | 19 |
| Tourmalet | 27 |
#number of samples by transect and elevation
sample_metadata %>%
group_by(Transect, Elevation) %>%
summarise(n_samples = length(EHI_number)) %>%
tt()| Transect | Elevation | n_samples |
|---|---|---|
| Aisa | 1250 | 6 |
| Aisa | 1450 | 6 |
| Aisa | 1650 | 4 |
| Aisa | 1850 | 6 |
| Aran | 1000 | 6 |
| Aran | 1080 | 7 |
| Aran | 1340 | 5 |
| Aran | 1500 | 6 |
| Aran | 1650 | 7 |
| Aran | 1850 | 6 |
| Sentein | 941 | 5 |
| Sentein | 1260 | 4 |
| Sentein | 1628 | 5 |
| Sentein | 1873 | 5 |
| Tourmalet | 953 | 5 |
| Tourmalet | 1255 | 4 |
| Tourmalet | 1561 | 4 |
| Tourmalet | 1797 | 4 |
| Tourmalet | 2065 | 2 |
| Tourmalet | 2134 | 3 |
| Tourmalet | 2306 | 5 |
[1] 106
Geographical location of sampled lizards in the Pyrenees.
#Summarise for generating map
options(dplyr.summarise.inform = FALSE)
sample_metadata_summary <- sample_metadata %>%
#Group by geography and count samples
select(EHI_number, latitude, longitude, Transect) %>%
group_by(latitude, longitude, Transect) %>%
summarize(count = n()) %>%
ungroup()
#plotting on map
## Determine the longitude and latitude ranges
lon_range <- range(sample_metadata_summary$longitude, na.rm = TRUE)
lat_range <- range(sample_metadata_summary$latitude, na.rm = TRUE)
sample_metadata_summary %>%
ggplot(.) +
#render map
geom_map(
data=map_data("world"),
map = map_data("world"),
aes(long, lat, map_id=region),
color = "white", fill = "#cccccc", linewidth = 0.2
) +
#render points
geom_point(
aes(x=longitude,y=latitude, color=Transect),
alpha=0.5, shape=16) +
#add general plot layout
theme_minimal() +
theme(legend.position = "right",
axis.title.x=element_blank(),
axis.title.y=element_blank()
) + coord_map("mercator", xlim = lon_range, ylim = lat_range)4 Data statistics
4.1 Sequencing reads statistics
sample_metadata %>%
summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2),
mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
tt()| Total | Average |
|---|---|
| 652.04 | 6.21 ± 2.77 |
4.3 DNA fractions
sequence_fractions <- read_counts %>%
pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
group_by(sample) %>%
summarise(mags = sum(value)) %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
mutate(mags_bases = mags*146) %>%
mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
tt()| Sample | Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|---|
| EHI00069 | 1.1441521 | 0.920532571 | 2.0517957 | 4.1058263 |
| EHI00070 | 0.2348919 | 0.937599880 | 0.8332809 | 1.6364476 |
| EHI00072 | 0.3717090 | 0.078319449 | 0.9507999 | 4.0430716 |
| EHI00073 | 0.1504762 | 0.007300164 | 0.6458066 | 1.8453666 |
| EHI00074 | 0.5700275 | 0.034671349 | 1.7906564 | 5.7526051 |
| EHI00075 | 0.2702205 | 0.444390387 | 0.9599202 | 2.4896265 |
| EHI00076 | 0.3350547 | 0.081391903 | 1.4172925 | 3.8389468 |
| EHI00077 | 0.1353495 | 0.641770627 | 0.3885242 | 1.3947384 |
| EHI00079 | 0.3377906 | 2.721037061 | 0.5653554 | 0.8084465 |
| EHI00080 | 0.7291964 | 1.605187153 | 2.3611523 | 2.6883254 |
| EHI00081 | 0.2184180 | 0.727448090 | 0.5615342 | 1.3358755 |
| EHI00085 | 0.2300087 | 0.212261834 | 0.5665817 | 1.9281604 |
| EHI00086 | 0.3471176 | 0.417593612 | 1.0770869 | 1.7440344 |
| EHI00088 | 0.2278663 | 0.071769210 | 0.5556301 | 3.0141044 |
| EHI00089 | 0.1751910 | 0.034915171 | 0.5162821 | 1.9576155 |
| EHI00091 | 0.2622745 | 0.230039482 | 0.8255133 | 2.4924602 |
| EHI00092 | 0.3917724 | 0.170842450 | 1.0255541 | 4.9181624 |
| EHI00093 | 0.3725739 | 0.199633318 | 1.1938025 | 2.3137473 |
| EHI00095 | 0.1768550 | 0.255990148 | 0.3449738 | 1.9242079 |
| EHI00097 | 0.5249214 | 0.366551383 | 0.8028468 | 2.9685729 |
| EHI00098 | 0.3176395 | 0.040047629 | 1.7271586 | 3.1498120 |
| EHI00100 | 0.4386907 | 2.398470758 | 0.8228622 | 4.1093643 |
| EHI00101 | 0.7257361 | 0.451928365 | 4.9291736 | 1.9943098 |
| EHI00103 | 0.2036111 | 0.843529516 | 0.5714195 | 1.3395113 |
| EHI00104 | 0.1769983 | 0.643701669 | 0.6807099 | 1.2574891 |
| EHI00105 | 0.2711051 | 0.005694785 | 0.9087805 | 2.7311372 |
| EHI00106 | 1.1082626 | 1.058626752 | 1.8709694 | 2.0746217 |
| EHI00107 | 0.3818355 | 0.184257358 | 1.5836979 | 3.0592637 |
| EHI00108 | 0.3420046 | 0.394171066 | 0.5356632 | 2.7441687 |
| EHI00110 | 0.3648881 | 0.591747269 | 0.5588077 | 2.6145261 |
| EHI00111 | 0.4392502 | 0.151954279 | 1.1992388 | 3.5773008 |
| EHI00112 | 0.7134927 | 0.085183185 | 1.1328903 | 3.6469985 |
| EHI00113 | 0.8620893 | 0.617092903 | 1.4939464 | 7.3254896 |
| EHI00114 | 0.4127445 | 0.087174427 | 2.3985081 | 4.6641659 |
| EHI00115 | 0.6290438 | 0.062352261 | 2.6081814 | 3.0085937 |
| EHI00116 | 0.4130247 | 0.039596686 | 0.8559983 | 4.7270478 |
| EHI00117 | 0.6850197 | 0.423816725 | 1.2454024 | 7.0782364 |
| EHI00118 | 0.3963952 | 0.784391517 | 0.9049477 | 3.7006953 |
| EHI00119 | 0.4448016 | 0.038727564 | 1.5277021 | 4.6761692 |
| EHI00120 | 0.3549466 | 0.051038023 | 0.7165254 | 3.8122875 |
| EHI00121 | 0.2487524 | 1.196143747 | 0.9172490 | 1.9448859 |
| EHI00122 | 0.3967333 | 0.056247498 | 1.7140526 | 5.3880487 |
| EHI00124 | 0.3101617 | 0.113171041 | 1.6986936 | 3.2732918 |
| EHI00125 | 0.3008464 | 0.313488992 | 1.2381544 | 3.1620784 |
| EHI00128 | 0.3688688 | 0.149336152 | 1.0558709 | 2.9648899 |
| EHI00129 | 0.2827708 | 0.212726094 | 0.7352845 | 2.7738704 |
| EHI00130 | 0.2917071 | 0.082344877 | 1.0170695 | 2.1151693 |
| EHI00131 | 0.1940832 | 0.943415382 | 0.4863996 | 1.2048589 |
| EHI00133 | 0.2569782 | 0.429506925 | 0.3484713 | 3.3747982 |
| EHI00134 | 0.5620395 | 1.988142143 | 1.9520876 | 3.9427347 |
| EHI00137 | 0.2948339 | 0.250489910 | 1.0341318 | 2.4844583 |
| EHI00138 | 0.2673241 | 0.179238573 | 0.5826993 | 2.8578460 |
| EHI00139 | 0.3001940 | 0.034225947 | 0.9413847 | 2.1426906 |
| EHI00176 | 0.7041777 | 0.100761385 | 0.7196926 | 2.7465205 |
| EHI00177 | 0.2555111 | 0.016053624 | 0.5228890 | 2.1639513 |
| EHI00178 | 0.2415179 | 0.045338274 | 1.3634101 | 2.6038297 |
| EHI00179 | 0.1816121 | 0.023148016 | 0.9401620 | 2.3926575 |
| EHI00180 | 0.5363481 | 0.037200326 | 0.7786810 | 2.4699635 |
| EHI00181 | 0.1978972 | 0.301399301 | 0.4733087 | 2.4456488 |
| EHI00422 | 0.1660426 | 0.058188268 | 0.4631427 | 2.3301737 |
| EHI00426 | 0.6048076 | 0.374841259 | 1.6091057 | 5.6754435 |
| EHI00428 | 0.4361908 | 0.354862015 | 0.9866519 | 3.7380622 |
| EHI00433 | 0.6941201 | 0.730117194 | 1.6017157 | 3.7066518 |
| EHI00438 | 0.2938182 | 0.100587490 | 0.8490354 | 4.7546269 |
| EHI00441 | 0.5217550 | 0.202212815 | 2.3199030 | 6.8309541 |
| EHI00451 | 0.4696437 | 0.318898071 | 1.2315115 | 5.0794188 |
| EHI00456 | 0.5747753 | 0.097152439 | 0.6337192 | 4.7496272 |
| EHI00458 | 0.5541280 | 0.326249839 | 1.7364884 | 4.9185106 |
| EHI00462 | 0.5139086 | 0.330809687 | 1.3870295 | 4.8485639 |
| EHI00464 | 0.5604449 | 0.962080506 | 1.5487451 | 5.0533364 |
| EHI00465 | 0.1355889 | 0.041795770 | 0.7088416 | 1.4262076 |
| EHI00467 | 0.4291035 | 0.854761759 | 1.6444666 | 3.7590494 |
| EHI00470 | 0.6625583 | 1.679223689 | 1.8675738 | 8.1840405 |
| EHI00472 | 0.3619652 | 0.586418138 | 1.8074071 | 3.3732380 |
| EHI00473 | 0.4841385 | 0.477433785 | 1.2481742 | 4.8043613 |
| EHI00477 | 0.4782444 | 0.087653441 | 2.7186490 | 3.7317223 |
| EHI00479 | 0.8395454 | 0.074374376 | 1.9936025 | 12.8156007 |
| EHI00480 | 0.7475415 | 0.793212202 | 3.1481051 | 4.3553735 |
| EHI00481 | 0.5040521 | 0.094726409 | 1.3739735 | 5.3792865 |
| EHI00483 | 0.4545170 | 1.532171008 | 0.8653232 | 4.7402692 |
| EHI00484 | 0.3037426 | 0.269250378 | 1.3870200 | 5.2563238 |
| EHI00488 | 0.4900356 | 2.212642337 | 2.0850126 | 4.3632065 |
| EHI00490 | 0.6234248 | 0.180662818 | 3.5993286 | 7.8297890 |
| EHI00493 | 0.4026458 | 2.030328521 | 2.7098633 | 2.4780854 |
| EHI00496 | 0.4536382 | 0.415085470 | 1.7045368 | 6.1699650 |
| EHI00499 | 3.5142773 | 2.204637514 | 3.4291586 | 3.2083353 |
| EHI00504 | 0.5335873 | 2.167943278 | 1.2709452 | 4.9663386 |
| EHI00506 | 0.7953759 | 0.639271744 | 1.7418071 | 8.0883653 |
| EHI00507 | 0.4077748 | 0.136492692 | 1.5474048 | 5.5083429 |
| EHI00512 | 0.6299478 | 8.067540048 | 0.5964127 | 1.3433168 |
| EHI00514 | 0.4590217 | 0.336676348 | 1.5023882 | 6.2571308 |
| EHI00518 | 0.4082296 | 0.012341521 | 1.0122951 | 5.8379043 |
| EHI00524 | 0.4918121 | 1.320091255 | 2.0577845 | 4.8996823 |
| EHI00525 | 0.3238390 | 0.105964164 | 1.1420157 | 5.0531429 |
| EHI00527 | 0.5623443 | 0.128178226 | 2.3107744 | 7.6106934 |
| EHI00529 | 0.3350620 | 0.129930230 | 1.3102065 | 5.1130080 |
| EHI00536 | 0.4725239 | 0.010876654 | 1.2534820 | 7.7176039 |
| EHI00537 | 0.3665586 | 0.089579032 | 2.4631156 | 1.4492223 |
| EHI00538 | 0.4501201 | 2.638414328 | 0.6397030 | 3.1637601 |
| EHI00541 | 0.7086242 | 0.152318595 | 1.4153324 | 6.8350770 |
| EHI00547 | 1.0225445 | 2.648359289 | 7.0526142 | 6.1760904 |
| EHI00566 | 0.5706734 | 0.218664855 | 1.7237625 | 7.3779017 |
| EHI00567 | 0.4565067 | 0.133011864 | 1.9051503 | 4.4466414 |
| EHI00568 | 0.5246686 | 0.219110833 | 1.7989251 | 3.5919103 |
| EHI00569 | 0.6859077 | 0.169999886 | 2.6761990 | 6.7957645 |
sequence_fractions %>%
pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
mutate(value = value / 1000000000) %>%
mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
ggplot(., aes(x = sample, y = value, fill=fraction)) +
geom_bar(position="stack", stat = "identity") +
scale_fill_manual(name="Sequence type",
breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")4.4 Recovered microbial fraction
singlem_table <- sequence_fractions %>%
mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
select(sample,mags_proportion,singlem_proportion) %>%
mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify
singlem_table %>%
pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
ggplot(., aes(x = value, y = sample, color=proportion)) +
geom_line(aes(group = sample), color = "#f8a538") +
geom_point() +
scale_color_manual(name="Proportion",
breaks=c("mags_proportion","singlem_proportion"),
labels=c("Recovered","Estimated"),
values=c("#52e1e8", "#876b53"))+
facet_nested(species + sample_type ~ ., scales="free",space="free")+
theme_classic() +
labs(y = "Samples", x = "Prokaryotic fraction (%)") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),
legend.position = "right",
strip.background.y=element_rect(color = NA, fill= "#f4f4f4"))5 MAG catalogue
5.1 Genome phylogeny
# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
# Generate basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
ggtree(., layout="fan", open.angle=10, size=0.5)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.55, width=0.1, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
geom_tiplab2(size=1, hjust=-0.1) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()
# Add completeness ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=completeness, y=genome, fill=contamination),
offset = 0.55,
orientation="y",
stat="identity")
# Add genome-size ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_manual(values = "#cccccc") +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=length, y=genome),
offset = 0.05,
orientation="y",
stat="identity")
# Add text
circular_tree <- circular_tree +
annotate('text', x=3.4, y=0, label=' Phylum', family='arial', size=3.5) +
annotate('text', x=4.3, y=0, label=' Genome quality', family='arial',size=3.5)+
annotate('text', x=4.8, y=0, label=' Genome size', family='arial', size=3.5)
#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)5.2 Genome quality
genome_metadata %>%
summarise(completeness_mean=mean(completeness) %>% round(2) %>% as.character(),
completeness_sd=sd(completeness) %>% round(2) %>% as.character(),
contamination_mean=mean(contamination) %>% round(2),
contamination_sd=sd(contamination) %>% round(2)) %>%
unite("Completeness",completeness_mean, completeness_sd, sep = " ± ", remove = TRUE) %>%
unite("Contamination",contamination_mean, contamination_sd, sep = " ± ", remove = TRUE) %>%
tt()| Completeness | Contamination |
|---|---|
| 79.99 ± 15.48 | 2.19 ± 1.94 |
#Generate quality biplot
genome_biplot <- genome_metadata %>%
select(c(genome,domain,phylum,completeness,contamination,length)) %>%
arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
geom_point(alpha=0.7) +
ylim(c(10,0)) +
scale_color_manual(values=phylum_colors) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "none")
#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
ggplot(aes(y=contamination)) +
ylim(c(10,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_void() +
theme(legend.position = "none",
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
ggplot(aes(x=completeness)) +
xlim(c(50,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_void() +
theme(legend.position = "none",
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))5.3 Functional overview
# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
to.elements(., GIFT_db)
# Generate basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
ggtree(., size = 0.3) ***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()
#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
vexpand(.08) +
coord_cartesian(clip = "off") +
scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
labs(fill="GIFT")
#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()
# Add completeness barplots
function_tree <- function_tree +
geom_fruit(data=genome_metadata,
geom=geom_bar,
grid.params=list(axis="x", text.size=2, nbreak = 1),
axis.params=list(vline=TRUE),
mapping = aes(x=length, y=genome, fill=completeness),
offset = 3.8,
orientation="y",
stat="identity") +
scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
labs(fill="Genome\ncompleteness")
function_tree5.4 Functional ordination
# Generate the tSNE ordination
tSNE_function <- Rtsne(X=function_table, dims = 2, check_duplicates = FALSE)
# Plot the ordination
function_ordination <- tSNE_function$Y %>%
as.data.frame() %>%
mutate(genome=rownames(function_table)) %>%
inner_join(genome_metadata, by="genome") %>%
rename(tSNE1="V1", tSNE2="V2") %>%
select(genome,phylum,tSNE1,tSNE2, length) %>%
ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
geom_point(shape=16, alpha=0.7) +
scale_color_manual(values=phylum_colors) +
theme_minimal() +
labs(color="Phylum", size="Genome size") +
guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend
function_ordination6 Community composition
6.1 Taxonomy overview
6.1.1 Stacked barplot
genome_counts_filt_met<-genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
left_join(., sample_metadata, by = join_by(sample == EHI_number)) %>% #append sample metadata
filter(count > 0) #filter 0 counts
genome_counts_filt_met$Elevation<-as.factor(genome_counts_filt_met$Elevation)
# Create an interaction variable for elevation and sample
genome_counts_filt_met$interaction_var <- interaction(genome_counts_filt_met$sample, genome_counts_filt_met$Elevation)
# Plot stacked barplot
ggplot(genome_counts_filt_met, aes(x=interaction_var,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
geom_bar(stat="identity", colour="white", linewidth=0.1)+ #plot stacked bars with white borders
scale_fill_manual(values=phylum_colors) +
labs(y = "Relative abundance", x="Elevation (m)") +
guides(fill = guide_legend(ncol = 3)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size=7))+
scale_x_discrete(labels = function(x) gsub(".*\\.", "", x)) +
facet_wrap(~Transect, scales = "free", labeller = as_labeller(function(label) gsub(".*\\.", "", label))) #only show elevation label6.1.2 Phylum relative abundances
phylum_summary <- genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
left_join(genome_metadata, by = join_by(genome == genome)) %>%
group_by(sample,phylum) %>%
summarise(relabun=sum(count))
phylum_summary %>%
group_by(phylum) %>%
summarise(total_mean=mean(relabun*100, na.rm=T),
total_sd=sd(relabun*100, na.rm=T)) %>%
mutate(total=str_c(round(total_mean,2),"±",round(total_sd,2))) %>%
arrange(-total_mean) %>%
dplyr::select(phylum,total) %>%
tt()| phylum | total |
|---|---|
| p__Bacillota_A | 40.6±15.93 |
| p__Bacteroidota | 38.3±15.88 |
| p__Pseudomonadota | 5.83±7.87 |
| p__Bacillota | 4.59±6.37 |
| p__Campylobacterota | 3.47±5.63 |
| p__Desulfobacterota | 2.68±4.05 |
| p__Verrucomicrobiota | 1.53±2.43 |
| p__Bacillota_C | 0.95±1.05 |
| p__Fusobacteriota | 0.91±3.7 |
| p__Cyanobacteriota | 0.4±0.6 |
| p__Spirochaetota | 0.22±0.97 |
| p__Bacillota_B | 0.19±0.32 |
| p__Actinomycetota | 0.17±0.52 |
| p__Chlamydiota | 0.1±0.61 |
| p__Deferribacterota | 0.05±0.27 |
| p__Synergistota | 0.01±0.08 |
phylum_arrange <- phylum_summary %>%
group_by(phylum) %>%
summarise(mean=mean(relabun)) %>%
arrange(-mean) %>%
select(phylum) %>%
pull()
phylum_summary %>%
filter(phylum %in% phylum_arrange) %>%
mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
scale_color_manual(values=phylum_colors[rev(phylum_arrange)]) +
geom_jitter(alpha=0.5) +
theme_minimal()6.2 Taxonomy boxplot
6.2.1 Family
family_summary <- genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>% #append sample metadata
left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
group_by(sample,family) %>%
summarise(relabun=sum(count))
family_summary %>%
group_by(family) %>%
summarise(mean=mean(relabun),sd=sd(relabun)) %>%
arrange(-mean) %>%
tt()| family | mean | sd |
|---|---|---|
| f__Lachnospiraceae | 2.798160e-01 | 0.1323151296 |
| f__Bacteroidaceae | 2.132576e-01 | 0.1045524501 |
| f__Tannerellaceae | 1.122881e-01 | 0.0756060991 |
| f__ | 6.556348e-02 | 0.0820773598 |
| f__Helicobacteraceae | 3.474580e-02 | 0.0562678198 |
| f__Marinifilaceae | 3.441228e-02 | 0.0255200299 |
| f__UBA3700 | 2.777694e-02 | 0.0330290932 |
| f__Desulfovibrionaceae | 2.680077e-02 | 0.0404627923 |
| f__Ruminococcaceae | 2.435712e-02 | 0.0226880677 |
| f__Rikenellaceae | 1.900710e-02 | 0.0176572117 |
| f__Erysipelotrichaceae | 1.767814e-02 | 0.0269603016 |
| f__Oscillospiraceae | 1.696763e-02 | 0.0140960977 |
| f__Coprobacillaceae | 1.273032e-02 | 0.0336927310 |
| f__Mycoplasmoidaceae | 1.210259e-02 | 0.0270262454 |
| f__Enterobacteriaceae | 1.077111e-02 | 0.0651262303 |
| f__Fusobacteriaceae | 9.053249e-03 | 0.0370456782 |
| f__Akkermansiaceae | 8.677367e-03 | 0.0108836065 |
| f__CAG-239 | 7.003283e-03 | 0.0098019806 |
| f__LL51 | 6.656864e-03 | 0.0216352391 |
| f__Anaerotignaceae | 6.551732e-03 | 0.0074061779 |
| f__UBA3830 | 5.185462e-03 | 0.0171469663 |
| f__Gastranaerophilaceae | 4.003188e-03 | 0.0059975426 |
| f__Muribaculaceae | 3.991492e-03 | 0.0409006227 |
| f__Butyricicoccaceae | 3.919222e-03 | 0.0050520985 |
| f__CAG-274 | 3.051617e-03 | 0.0060828463 |
| f__Acutalibacteraceae | 2.723615e-03 | 0.0047471119 |
| f__Anaerovoracaceae | 2.683446e-03 | 0.0040557355 |
| f__Pumilibacteraceae | 2.544525e-03 | 0.0041701010 |
| f__UBA1997 | 2.383785e-03 | 0.0080270055 |
| f__CAG-508 | 2.324078e-03 | 0.0063109845 |
| f__Brevinemataceae | 2.191263e-03 | 0.0097068603 |
| f__Peptococcaceae | 1.937490e-03 | 0.0031961496 |
| f__Rhodocyclaceae | 1.936615e-03 | 0.0194735479 |
| f__DTU072 | 1.792884e-03 | 0.0055918788 |
| f__UBA660 | 1.742458e-03 | 0.0054664441 |
| f__MGBC116941 | 1.714356e-03 | 0.0090980913 |
| f__Massilibacillaceae | 1.502892e-03 | 0.0024528334 |
| f__Eggerthellaceae | 1.377525e-03 | 0.0037174906 |
| f__Enterococcaceae | 8.454990e-04 | 0.0070848098 |
| f__CALTSX01 | 5.198677e-04 | 0.0053270584 |
| f__Chlamydiaceae | 5.170637e-04 | 0.0030492819 |
| f__Mucispirillaceae | 4.593259e-04 | 0.0026630822 |
| f__CALVMC01 | 4.496143e-04 | 0.0018583730 |
| f__Clostridiaceae | 4.212744e-04 | 0.0029132915 |
| f__Acidaminococcaceae | 4.004438e-04 | 0.0015777220 |
| f__UBA1242 | 3.719350e-04 | 0.0014704441 |
| f__RUG11792 | 3.717143e-04 | 0.0019425248 |
| f__CAG-465 | 3.497207e-04 | 0.0015695529 |
| f__Microbacteriaceae | 3.199134e-04 | 0.0026999464 |
| f__CAG-288 | 2.985206e-04 | 0.0017514912 |
| f__Streptococcaceae | 2.479473e-04 | 0.0018438849 |
| f__Anaplasmataceae | 2.435113e-04 | 0.0021508694 |
| f__Hepatoplasmataceae | 2.362221e-04 | 0.0024205565 |
| f__Aeromonadaceae | 2.327082e-04 | 0.0022486997 |
| f__Peptostreptococcaceae | 1.412306e-04 | 0.0014471826 |
| f__Rhodobacteraceae | 1.371859e-04 | 0.0014057373 |
| f__Xanthomonadaceae | 9.245878e-05 | 0.0009474206 |
| f__Synergistaceae | 7.840493e-05 | 0.0008034115 |
| f__Lactobacillaceae | 4.203166e-05 | 0.0004306963 |
| f__Turicibacteraceae | 0.000000e+00 | 0.0000000000 |
family_arrange <- family_summary %>%
group_by(family) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(family) %>%
pull()
family_summary %>%
left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
left_join(sample_metadata,by=join_by(sample==EHI_number)) %>%
filter(family %in% family_arrange[1:20]) %>%
mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
scale_color_manual(values=phylum_colors[-8]) +
geom_jitter(alpha=0.5) +
theme_minimal() +
labs(y="Family", x="Relative abundance", color="Phylum")6.2.2 Genus
genus_summary <- genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>% #append sample metadata
left_join(genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
group_by(sample,genus) %>%
summarise(relabun=sum(count)) %>%
filter(genus != "g__")
genus_summary %>%
group_by(genus) %>%
summarise(mean=mean(relabun),sd=sd(relabun)) %>%
arrange(-mean) %>%
tt()| genus | mean | sd |
|---|---|---|
| g__Bacteroides | 1.725925e-01 | 0.0994470682 |
| g__Parabacteroides | 1.103381e-01 | 0.0767310570 |
| g__Roseburia | 5.077999e-02 | 0.0743624266 |
| g__Phocaeicola | 3.874581e-02 | 0.0435882803 |
| g__JAAYNV01 | 3.526845e-02 | 0.0735025649 |
| g__Odoribacter | 3.376557e-02 | 0.0254845145 |
| g__Helicobacter_J | 2.983823e-02 | 0.0400574428 |
| g__CAG-95 | 1.659217e-02 | 0.0240471187 |
| g__Alistipes | 1.555908e-02 | 0.0156740595 |
| g__Kineothrix | 1.544351e-02 | 0.0584815409 |
| g__MGBC136627 | 1.451743e-02 | 0.0230914933 |
| g__Mycoplasmoides | 1.139564e-02 | 0.0269054026 |
| g__Hungatella_A | 1.113992e-02 | 0.0670243947 |
| g__Anaerotruncus | 1.006956e-02 | 0.0112599653 |
| g__Velocimicrobium | 9.687390e-03 | 0.0159382742 |
| g__Enterocloster | 9.506772e-03 | 0.0098451213 |
| g__Fusobacterium_A | 9.053249e-03 | 0.0370456782 |
| g__Acetatifactor | 8.989245e-03 | 0.0139417426 |
| g__Akkermansia | 8.677367e-03 | 0.0108836065 |
| g__Clostridium_Q | 7.770136e-03 | 0.0138182274 |
| g__Bilophila | 7.347777e-03 | 0.0088824113 |
| g__Lawsonia | 7.130440e-03 | 0.0356135290 |
| g__Intestinimonas | 6.565049e-03 | 0.0061282758 |
| g__Lacrimispora | 6.397950e-03 | 0.0078443852 |
| g__Lachnotalea | 5.679000e-03 | 0.0086820363 |
| g__Desulfovibrio | 5.604601e-03 | 0.0085133819 |
| g__MGBC140009 | 5.533083e-03 | 0.0134005650 |
| g__Extibacter | 5.301402e-03 | 0.0438086011 |
| g__Coprobacillus | 5.255707e-03 | 0.0160178991 |
| g__Ventrimonas | 5.038137e-03 | 0.0083320432 |
| g__NHYM01 | 4.907573e-03 | 0.0427691471 |
| g__Dielma | 4.863951e-03 | 0.0065480876 |
| g__Eisenbergiella | 4.698317e-03 | 0.0069699563 |
| g__CHH4-2 | 4.463440e-03 | 0.0044701587 |
| g__RGIG4733 | 4.252340e-03 | 0.0103379868 |
| g__Negativibacillus | 4.038946e-03 | 0.0054742770 |
| g__Thomasclavelia | 3.858196e-03 | 0.0117540652 |
| g__Hungatella | 3.440712e-03 | 0.0046402483 |
| g__C-19 | 3.334625e-03 | 0.0097460323 |
| g__Citrobacter | 3.199667e-03 | 0.0207814167 |
| g__UMGS1251 | 2.883830e-03 | 0.0066485409 |
| g__Oscillibacter | 2.697945e-03 | 0.0038453712 |
| g__CAZU01 | 2.687821e-03 | 0.0062923086 |
| g__Breznakia | 2.569391e-03 | 0.0076690515 |
| g__Copromonas | 2.559264e-03 | 0.0037548588 |
| g__Mailhella | 2.486840e-03 | 0.0034349591 |
| g__Pseudoflavonifractor | 2.349829e-03 | 0.0028250595 |
| g__Intestinibacillus | 2.290036e-03 | 0.0027696754 |
| g__Escherichia | 2.232632e-03 | 0.0160096591 |
| g__Brevinema | 2.191263e-03 | 0.0097068603 |
| g__MGBC165282 | 2.146060e-03 | 0.0051869977 |
| g__Rikenella | 2.123615e-03 | 0.0033608364 |
| g__Morganella | 2.019952e-03 | 0.0206983454 |
| g__Robinsoniella | 2.000942e-03 | 0.0198167695 |
| g__Parabacteroides_B | 1.950084e-03 | 0.0064473560 |
| g__Hafnia | 1.939718e-03 | 0.0112340944 |
| g__Fluviibacter | 1.936615e-03 | 0.0194735479 |
| g__JAIHAL01 | 1.909337e-03 | 0.0043937983 |
| g__CAJLXD01 | 1.809579e-03 | 0.0041385093 |
| g__Marseille-P3106 | 1.729807e-03 | 0.0025326958 |
| g__UBA866 | 1.535703e-03 | 0.0026149730 |
| g__MGBC116941 | 1.433278e-03 | 0.0090996222 |
| g__Duncaniella | 1.426611e-03 | 0.0146184139 |
| g__Stoquefichus | 1.418761e-03 | 0.0046100976 |
| g__Limenecus | 1.393591e-03 | 0.0029887106 |
| g__JAAYQI01 | 1.271605e-03 | 0.0020016472 |
| g__RGIG6463 | 1.268390e-03 | 0.0028982477 |
| g__Lawsonibacter | 1.177407e-03 | 0.0016547788 |
| g__Scatousia | 1.174334e-03 | 0.0033329854 |
| g__MGBC101980 | 1.147280e-03 | 0.0043856608 |
| g__Hespellia | 1.102861e-03 | 0.0080033094 |
| g__Clostridium_AQ | 1.075206e-03 | 0.0036972777 |
| g__Tidjanibacter | 1.062223e-03 | 0.0029392145 |
| g__Fournierella | 1.021882e-03 | 0.0022586979 |
| g__Eggerthella | 9.946416e-04 | 0.0031801193 |
| g__OM05-12 | 9.566920e-04 | 0.0022952664 |
| g__CALXRO01 | 9.555497e-04 | 0.0060396607 |
| g__CALURL01 | 9.088777e-04 | 0.0021653921 |
| g__Harryflintia | 8.906013e-04 | 0.0020436518 |
| g__MGBC133411 | 8.872215e-04 | 0.0021982622 |
| g__Scatacola_A | 8.685187e-04 | 0.0028129788 |
| g__JALFVM01 | 8.392923e-04 | 0.0020111922 |
| g__Bacteroides_G | 8.358400e-04 | 0.0024999742 |
| g__Ventrisoma | 8.334289e-04 | 0.0015664780 |
| g__CAG-269 | 8.185801e-04 | 0.0029513181 |
| g__IOR16 | 8.132853e-04 | 0.0024147185 |
| g__CAG-873 | 7.794020e-04 | 0.0079864937 |
| g__Buttiauxella | 7.562352e-04 | 0.0062186311 |
| g__14-2 | 7.242943e-04 | 0.0014229865 |
| g__Ureaplasma | 7.069544e-04 | 0.0022811697 |
| g__Scatocola | 6.903112e-04 | 0.0024144468 |
| g__Dysosmobacter | 6.880800e-04 | 0.0012817741 |
| g__Muricomes | 6.843281e-04 | 0.0023723812 |
| g__Anaerovorax | 6.779999e-04 | 0.0017521412 |
| g__UBA7185 | 6.690992e-04 | 0.0018509746 |
| g__Butyricimonas | 6.467068e-04 | 0.0016133034 |
| g__MGBC131033 | 6.443020e-04 | 0.0015998136 |
| g__Evtepia | 6.309865e-04 | 0.0010453519 |
| g__CAJMNU01 | 6.281598e-04 | 0.0008908563 |
| g__Beduini | 5.914594e-04 | 0.0013148807 |
| g__Muribaculum | 5.550053e-04 | 0.0056871120 |
| g__Scandinavium | 5.392506e-04 | 0.0034848792 |
| g__Lactonifactor | 5.340135e-04 | 0.0013169750 |
| g__CALTSX01 | 5.198677e-04 | 0.0053270584 |
| g__CAG-485 | 5.128011e-04 | 0.0052546481 |
| g__Merdicola | 5.109908e-04 | 0.0018183944 |
| g__Ventrenecus | 4.954074e-04 | 0.0024443063 |
| g__UMGS1202 | 4.885966e-04 | 0.0016971062 |
| g__Copranaerobaculum | 4.827706e-04 | 0.0029167211 |
| g__NSJ-61 | 4.574747e-04 | 0.0014044413 |
| g__Faecimonas | 4.467204e-04 | 0.0017621834 |
| g__RGIG8482 | 4.415175e-04 | 0.0020991286 |
| g__Faecivivens | 4.313538e-04 | 0.0008927244 |
| g__RGIG9287 | 4.228193e-04 | 0.0021002237 |
| g__Sarcina | 4.212744e-04 | 0.0029132915 |
| g__Blautia_A | 4.144742e-04 | 0.0010034450 |
| g__Scatenecus | 4.103021e-04 | 0.0026534948 |
| g__Phascolarctobacterium | 4.004438e-04 | 0.0015777220 |
| g__Raoultibacter | 3.828831e-04 | 0.0011609168 |
| g__Caccovivens | 3.719350e-04 | 0.0014704441 |
| g__CAJTFG01 | 3.690163e-04 | 0.0010223015 |
| g__HGM11386 | 3.642876e-04 | 0.0016060882 |
| g__CAG-465 | 3.497207e-04 | 0.0015695529 |
| g__Amedibacillus | 3.457091e-04 | 0.0023834921 |
| g__Enterococcus_A | 3.276668e-04 | 0.0023252026 |
| g__UMGS2016 | 3.212305e-04 | 0.0012912770 |
| g__Emergencia | 3.205926e-04 | 0.0009702487 |
| g__Holdemania | 3.098282e-04 | 0.0010287045 |
| g__Blautia | 3.095400e-04 | 0.0011077317 |
| g__Protoclostridium | 3.067429e-04 | 0.0010837140 |
| g__Fimivivens | 3.043536e-04 | 0.0008104254 |
| g__RGIG7389 | 2.985465e-04 | 0.0005848732 |
| g__CAG-345 | 2.985206e-04 | 0.0017514912 |
| g__UBA7173 | 2.963722e-04 | 0.0030369115 |
| g__Bariatricus | 2.928468e-04 | 0.0008379528 |
| g__Agathobaculum | 2.787287e-04 | 0.0016389968 |
| g__CALXDZ01 | 2.660973e-04 | 0.0006132220 |
| g__UBA940 | 2.621830e-04 | 0.0009075589 |
| g__Microbacterium | 2.563077e-04 | 0.0026263723 |
| g__Aminipila | 2.506050e-04 | 0.0007479203 |
| g__Lactococcus | 2.479473e-04 | 0.0018438849 |
| g__Wolbachia | 2.435113e-04 | 0.0021508694 |
| g__Paramuribaculum | 2.432436e-04 | 0.0024925056 |
| g__Hepatoplasma | 2.362221e-04 | 0.0024205565 |
| g__Aeromonas | 2.327082e-04 | 0.0022486997 |
| g__WRHT01 | 2.186196e-04 | 0.0006955370 |
| g__Zhenpiania | 2.116559e-04 | 0.0012708329 |
| g__UBA5026 | 2.112884e-04 | 0.0009778887 |
| g__UMGS1663 | 1.999591e-04 | 0.0007286034 |
| g__MGBC107952 | 1.752651e-04 | 0.0009887232 |
| g__CALXEL01 | 1.725652e-04 | 0.0013728689 |
| g__CAG-273 | 1.482564e-04 | 0.0007864691 |
| g__Clostridioides | 1.412306e-04 | 0.0014471826 |
| g__Paracoccus | 1.371859e-04 | 0.0014057373 |
| g__JAAWBF01 | 1.286984e-04 | 0.0006144643 |
| g__JAFLTL01 | 1.270703e-04 | 0.0013020827 |
| g__Bacteroides_H | 1.267213e-04 | 0.0012985068 |
| g__RUG12867 | 9.254620e-05 | 0.0006129332 |
| g__Stenotrophomonas | 9.245878e-05 | 0.0009474206 |
| g__Rahnella | 8.365618e-05 | 0.0008059384 |
| g__Lumbricidophila | 6.360575e-05 | 0.0006517650 |
| g__UBA3263 | 5.098641e-05 | 0.0005224553 |
| g__Fructobacillus | 4.203166e-05 | 0.0004306963 |
| g__Clostridium | 0.000000e+00 | 0.0000000000 |
| g__Turicibacter | 0.000000e+00 | 0.0000000000 |
genus_arrange <- genus_summary %>%
group_by(genus) %>%
summarise(mean=sum(relabun)) %>%
filter(genus != "g__")%>%
arrange(-mean) %>%
select(genus) %>%
mutate(genus= sub("^g__", "", genus)) %>%
pull()
genus_summary %>%
left_join(genome_metadata %>% select(genus,phylum) %>% unique(),by=join_by(genus==genus)) %>%
left_join(sample_metadata,by=join_by(sample==EHI_number)) %>%
mutate(genus= sub("^g__", "", genus)) %>%
filter(genus %in% genus_arrange[1:20]) %>%
mutate(genus=factor(genus,levels=rev(genus_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
scale_color_manual(values=phylum_colors[-c(3,4,6,8)]) +
geom_jitter(alpha=0.5) +
theme_minimal() +
labs(y="Family", x="Relative abundance", color="Phylum")7 Diversity analyses
7.1 Alpha diversity
# Calculate Hill numbers
richness <- genome_counts_filt %>%
column_to_rownames(var = "genome") %>%
dplyr::select(where(~ !all(. == 0))) %>%
hilldiv(., q = 0) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(richness = 1) %>%
rownames_to_column(var = "sample")
neutral <- genome_counts_filt %>%
column_to_rownames(var = "genome") %>%
dplyr::select(where(~ !all(. == 0))) %>%
hilldiv(., q = 1) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(neutral = 1) %>%
rownames_to_column(var = "sample")
phylogenetic <- genome_counts_filt %>%
column_to_rownames(var = "genome") %>%
dplyr::select(where(~ !all(. == 0))) %>%
hilldiv(., q = 1, tree = genome_tree) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(phylogenetic = 1) %>%
rownames_to_column(var = "sample")
# Aggregate basal GIFT into elements
#Get list of present MAGs
present_MAGs <- genome_counts_filt %>%
column_to_rownames(var = "genome") %>%
filter(rowSums(.[, -1]) != 0) %>%
rownames()
#Align KEGG annotations with present MAGs and remove all-zero and all-one traits
present_MAGs <- present_MAGs[present_MAGs %in% rownames(genome_gifts)]
genome_gifts_filt <- genome_gifts[present_MAGs,] %>%
select_if(~!all(. == 0)) %>% #remove all-zero modules
select_if(~!all(. == 1)) #remove all-one modules
#Filter count table to only contain present MAGs after KEGG filtering
genome_counts_filt_filt <- genome_counts_filt %>%
column_to_rownames(var = "genome")
genome_counts_filt_filt <- genome_counts_filt_filt[present_MAGs,]
dist <- genome_gifts_filt %>%
to.elements(., GIFT_db) %>%
traits2dist(., method = "gower")
functional <- genome_counts_filt_filt %>%
#column_to_rownames(var = "genome") %>%
#dplyr::select(where(~ !all(. == 0))) %>%
hilldiv(., q = 1, dist = dist) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(functional = 1) %>%
rownames_to_column(var = "sample") %>%
mutate(functional = if_else(is.nan(functional), 1, functional))
# Merge all metrics
alpha_div <- richness %>%
full_join(neutral, by = join_by(sample == sample)) %>%
full_join(phylogenetic, by = join_by(sample == sample)) %>%
full_join(functional, by = join_by(sample == sample))alpha_div %>%
pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
group_by(alpha)%>%
summarise(
Aisa_mean=mean(value[Transect=="Aisa"], na.rm=T),
Aisa_sd=sd(value[Transect=="Aisa"], na.rm=T),
Aran_mean=mean(value[Transect=="Aran"], na.rm=T),
Aran_sd=sd(value[Transect=="Aran"], na.rm=T),
Sentein_mean=mean(value[Transect=="Sentein"], na.rm=T),
Sentein_sd=sd(value[Transect=="Sentein"], na.rm=T),
Tourmalet_mean=mean(value[Transect=="Tourmalet"], na.rm=T),
Tourmalet_sd=sd(value[Transect=="Tourmalet"], na.rm=T))%>%
mutate(
Aisa=str_c(round(Aisa_mean,2),"±",round(Aisa_sd,2)),
Aran=str_c(round(Aran_mean,2),"±",round(Aran_sd,2)),
Sentein=str_c(round(Sentein_mean,2),"±",round(Sentein_sd,2)),
Tourmalet=str_c(round(Tourmalet_mean,2),"±",round(Tourmalet_sd,2))) %>%
arrange(-Aisa_mean) %>%
dplyr::select(alpha,Aisa,Aran,Sentein,Tourmalet) %>%
tt()| alpha | Aisa | Aran | Sentein | Tourmalet |
|---|---|---|---|---|
| richness | 198.77±65.73 | 153.32±72.26 | 257.32±71.08 | 245.04±68.73 |
| neutral | 87.81±36.55 | 71.13±36.69 | 110.98±45.84 | 101.5±43.7 |
| phylogenetic | 4.98±1.13 | 4.81±1.12 | 5.41±0.86 | 5.29±0.9 |
| functional | 1.5±0.05 | 1.5±0.06 | 1.52±0.03 | 1.5±0.04 |
alpha_div %>%
pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
left_join(., sample_metadata, by = join_by(sample == EHI_number)) %>%
mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic","functional"))) %>%
ggplot(aes(y = value, x = Transect, group=Transect, color=Transect, fill=Transect)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha=0.5) +
scale_color_manual(name="Transect",
breaks=c("Aisa","Aran","Sentein","Tourmalet"),
labels=c("Aisa","Aran","Sentein","Tourmalet"),
values=c("#e5bd5b", "#6b7398","#e2815a", "#876b96")) +
scale_fill_manual(name="Transect",
breaks=c("Aisa","Aran","Sentein","Tourmalet"),
labels=c("Aisa","Aran","Sentein","Tourmalet"),
values=c("#e5bd5b50", "#6b739850","#e2815a50", "#876b9650")) +
facet_wrap(. ~ metric, scales = "free", ncol=4) +
coord_cartesian(xlim = c(1, NA)) +
stat_compare_means(size=2) +
theme_classic() +
theme(
strip.background = element_blank(),
panel.grid.minor.x = element_line(size = .1, color = "grey"),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
ylab("Alpha diversity") 7.1.1 Regression plots
7.1.1.1 Richness diversity
columns <- c("richness","neutral","phylo","func","mapped","total")
alpha_div %>%
select(sample,richness) %>%
pivot_longer(-sample, names_to = "data", values_to = "value") %>%
mutate(data = factor(data, levels = columns)) %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
ggplot(aes(x = Elevation, y = value)) +
geom_point() +
geom_smooth(method = lm) +
facet_wrap(~ factor(Transect))+
labs(x = "Elevation (m)")7.1.1.2 Neutral diversity
alpha_div %>%
select(sample,neutral) %>%
pivot_longer(-sample, names_to = "data", values_to = "value") %>%
mutate(data = factor(data, levels = columns)) %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
ggplot(aes(x = Elevation, y = value)) +
geom_point() +
geom_smooth(method = lm) +
facet_wrap(~ factor(Transect))+
labs(x = "Elevation (m)")7.1.1.3 Phylogenetic diversity
alpha_div %>%
select(sample,phylogenetic) %>%
pivot_longer(-sample, names_to = "data", values_to = "value") %>%
mutate(data = factor(data, levels = columns)) %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
ggplot(aes(x = Elevation, y = value)) +
geom_point() +
geom_smooth(method = lm) +
facet_wrap(~ factor(Transect))+
labs(x = "Elevation (m)")7.1.1.4 Functional diversities
alpha_div %>%
select(sample,functional) %>%
pivot_longer(-sample, names_to = "data", values_to = "value") %>%
mutate(data = factor(data, levels = columns)) %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
ggplot(aes(x = Elevation, y = value)) +
geom_point() +
geom_smooth(method = lm) +
facet_wrap(~ factor(Transect))+
labs(x = "Elevation (m)")7.1.2 Mixed models
7.1.2.1 Richness diversity
richness_cor<-alpha_div %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
lmerTest::lmer(richness ~ log(sequencing_depth) + Elevation*Transect + (1 | Sampling_point), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | -1.556876e+03 | 213.86817441 | -7.279608 | 102.67218 | 6.967323e-11 |
| fixed | NA | log(sequencing_depth) | 1.089104e+02 | 11.22012642 | 9.706700 | 101.19457 | 3.904741e-16 |
| fixed | NA | Elevation | -5.945361e-02 | 0.05333190 | -1.114785 | 19.97815 | 2.781739e-01 |
| fixed | NA | TransectAran | -1.456675e+02 | 95.32890330 | -1.528052 | 19.77139 | 1.423407e-01 |
| fixed | NA | TransectSentein | -1.282416e+02 | 98.14650386 | -1.306634 | 20.96734 | 2.054881e-01 |
| fixed | NA | TransectTourmalet | -1.937802e+02 | 91.46019166 | -2.118739 | 20.50091 | 4.650671e-02 |
| fixed | NA | Elevation:TransectAran | 9.264522e-02 | 0.06209640 | 1.491958 | 19.59034 | 1.516391e-01 |
| fixed | NA | Elevation:TransectSentein | 1.120611e-01 | 0.06399044 | 1.751216 | 21.21597 | 9.435497e-02 |
| fixed | NA | Elevation:TransectTourmalet | 1.204621e-01 | 0.05764375 | 2.089769 | 20.55707 | 4.926134e-02 |
| ran_pars | Sampling_point | sd__(Intercept) | 1.411515e+01 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 4.636663e+01 | NA | NA | NA | NA |
7.1.2.2 Neutral diversity
neutral_cor<-alpha_div %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
lmerTest::lmer(neutral ~ log(sequencing_depth) + Elevation*Transect + (1|Sampling_point), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | -503.46092882 | 155.88510189 | -3.229692 | 102.94139 | 1.663714e-03 |
| fixed | NA | log(sequencing_depth) | 38.53316355 | 8.30102716 | 4.641975 | 102.38427 | 1.026312e-05 |
| fixed | NA | Elevation | -0.04029799 | 0.03545720 | -1.136525 | 17.82419 | 2.707869e-01 |
| fixed | NA | TransectAran | -69.54734931 | 63.33417863 | -1.098101 | 17.59583 | 2.869598e-01 |
| fixed | NA | TransectSentein | -73.46205358 | 65.46915017 | -1.122087 | 19.00703 | 2.758013e-01 |
| fixed | NA | TransectTourmalet | -97.07019202 | 60.91625317 | -1.593502 | 18.45946 | 1.280264e-01 |
| fixed | NA | Elevation:TransectAran | 0.04308459 | 0.04122860 | 1.045017 | 17.37945 | 3.103332e-01 |
| fixed | NA | Elevation:TransectSentein | 0.05934449 | 0.04271777 | 1.389222 | 19.28043 | 1.806027e-01 |
| fixed | NA | Elevation:TransectTourmalet | 0.05957175 | 0.03839934 | 1.551374 | 18.50873 | 1.377430e-01 |
| ran_pars | Sampling_point | sd__(Intercept) | 6.64785214 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 34.74128890 | NA | NA | NA | NA |
7.1.2.3 Phylogenetic diversity
phylo_cor<-alpha_div %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
lmerTest::lmer(phylogenetic ~ log(sequencing_depth) + Elevation*Transect + (1 | Sampling_point), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 7.9723798522 | 4.1857148123 | 1.9046639 | 103.16183 | 0.05960912 |
| fixed | NA | log(sequencing_depth) | -0.0638680767 | 0.2249894059 | -0.2838715 | 103.46820 | 0.77707614 |
| fixed | NA | Elevation | -0.0012366705 | 0.0008881157 | -1.3924655 | 18.17137 | 0.18058793 |
| fixed | NA | TransectAran | -0.7649418524 | 1.5853816744 | -0.4824970 | 17.90442 | 0.63529916 |
| fixed | NA | TransectSentein | -1.7363345713 | 1.6449704378 | -1.0555415 | 19.63947 | 0.30399034 |
| fixed | NA | TransectTourmalet | -3.2228577094 | 1.5284405960 | -2.1085921 | 18.97101 | 0.04850288 |
| fixed | NA | Elevation:TransectAran | 0.0002832543 | 0.0010313958 | 0.2746320 | 17.63476 | 0.78679044 |
| fixed | NA | Elevation:TransectSentein | 0.0014224302 | 0.0010740261 | 1.3243907 | 19.95532 | 0.20034248 |
| fixed | NA | Elevation:TransectTourmalet | 0.0022220955 | 0.0009635780 | 2.3060877 | 19.01432 | 0.03253476 |
| ran_pars | Sampling_point | sd__(Intercept) | 0.0488567254 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.9517063007 | NA | NA | NA | NA |
7.1.2.4 Functional diversities
func_cor<-alpha_div %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
lmerTest::lmer(functional ~ log(sequencing_depth) + Elevation*Transect + (1 | Sampling_point), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1.604632e+00 | 2.146120e-01 | 7.47689687 | 105 | 2.400124e-11 |
| fixed | NA | log(sequencing_depth) | -5.072130e-03 | 1.154428e-02 | -0.43936293 | 105 | 6.613015e-01 |
| fixed | NA | Elevation | -1.096272e-05 | 4.526467e-05 | -0.24219152 | 105 | 8.091042e-01 |
| fixed | NA | TransectAran | 2.877083e-02 | 8.079764e-02 | 0.35608504 | 105 | 7.224913e-01 |
| fixed | NA | TransectSentein | -5.637786e-03 | 8.386423e-02 | -0.06722516 | 105 | 9.465303e-01 |
| fixed | NA | TransectTourmalet | -6.778434e-02 | 7.791307e-02 | -0.86999962 | 105 | 3.862852e-01 |
| fixed | NA | Elevation:TransectAran | -2.198663e-05 | 5.256109e-05 | -0.41830623 | 105 | 6.765775e-01 |
| fixed | NA | Elevation:TransectSentein | 1.445851e-05 | 5.475949e-05 | 0.26403659 | 105 | 7.922692e-01 |
| fixed | NA | Elevation:TransectTourmalet | 4.317958e-05 | 4.911932e-05 | 0.87907520 | 105 | 3.813678e-01 |
| ran_pars | Sampling_point | sd__(Intercept) | 0.000000e+00 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 4.887921e-02 | NA | NA | NA | NA |
7.2 Beta diversity
beta_q0n <- genome_counts_filt %>%
column_to_rownames(., "genome") %>%
hillpair(., q = 0)
beta_q1n <- genome_counts_filt %>%
column_to_rownames(., "genome") %>%
hillpair(., q = 1)
beta_q1p <- genome_counts_filt %>%
column_to_rownames(., "genome") %>%
hillpair(., q = 1, tree = genome_tree)
beta_q1f <- genome_counts_filt_filt %>%
#column_to_rownames(., "genome") %>%
hillpair(., q = 1, dist = dist)8 HMSC set-up
8.2 Subsetting
# Subset by prevalence (present in more than 5 samples)
selected_genomes1 <- genome_counts %>%
filter(rowSums(across(starts_with("EHI")) != 0) >= 5) %>%
select(genome) %>% pull()
# Subset by minimum representation of 1% relative abundance in 5 samples
selected_genomes2 <- genome_counts %>%
filter(genome %in% selected_genomes1) %>%
column_to_rownames(var="genome") %>%
tss() %>%
as.data.frame() %>%
filter(rowSums(across(starts_with("EHI")) >= 0.01) >= 5) %>%
rownames()
# Subset genome metadata
genome_metadata_subset <- genome_metadata %>%
filter(genome %in% selected_genomes2)# Random effects data (study design)
StudyDesign <- sample_metadata %>%
select(EHI_number,Sampling_point,Transect, Elevation) %>%
mutate(Sampling_point = factor(Sampling_point)) %>%
mutate(Transect = factor(Transect)) %>%
mutate(Elevation = factor(Elevation)) %>%
column_to_rownames("EHI_number")
# Genome count table (quantitative community data)
YData <- read_counts %>%
filter(genome %in% selected_genomes2) %>% #subset genomes
mutate(across(where(is.numeric), ~ . +1 )) %>% #add +1 pseudocount to remove zeros
mutate(across(where(is.numeric), ~ . / (genome_metadata_subset$length / 150) )) %>% #transform to genome counts
mutate(across(where(is.numeric), ~ log(.) )) %>% #log-transform
column_to_rownames("genome") %>%
select(all_of(row.names(StudyDesign))) %>% #filter only faecal samples
as.data.frame() %>%
t() # transpose
# Fixed effects data (explanatory variables)
XData <- sample_metadata %>%
select(EHI_number,Elevation,Tree_cover, Anthropization_cover) %>%
mutate(logseqdepth=read_counts %>% #total log-sequencing depth
select(all_of(row.names(StudyDesign))) %>%
colSums() %>%
log()
) %>%
column_to_rownames("EHI_number")
# Genome trait data (not needed now, and it gives error in to.functions section)
#TrData <- genome_gifts %>%
#arrange(match(genome, colnames(YData))) %>%
#column_to_rownames(var="genome") %>%
#to.functions(.,GIFT_db) %>%
#as.data.frame()
# Genome phylogeny
PData <- genome_tree8.4 Define and Hmsc models
#Define models
model1 = Hmsc(Y=YData,
XData = XData,
XFormula = XFormula1,
studyDesign = StudyDesign,
phyloTree = PData,
ranLevels = list("Sampling_point"=rL.sampling_point, "Transect"=rL.transect),
distr = "normal",
YScale = TRUE)
#Save list of models as an R object.
model_list = list(model1=model1)
if (!dir.exists("hmsc")){dir.create("hmsc")}
save(model_list, file = "hmsc/hmsc.Rdata")Upload hmsc/hmsc.Rdata to the HPC respecting the directory structure.
8.6 Generate Hmsc executables
The next chunk generates shell files for every combination of model, MCMC samples and MCMM thinning, ready to be launched as SLURM jobs.
modelchains <- expand.grid(model = names(model_list), sample = MCMC_samples_list, thin = MCMC_thin_list)
if (!dir.exists("hmsc")){dir.create("hmsc")}
for(i in c(1:nrow(modelchains))){
modelname=as.character(modelchains[i,1])
sample=modelchains[i,2]
thin=modelchains[i,3]
executablename <- paste0("hmsc/exe_",modelname,"_",sample,"_",thin,".sh")
fitname <- paste0("hmsc/fit_",modelname,"_",sample,"_",thin,".Rdata")
convname <- paste0("hmsc/conv_",modelname,"_",sample,"_",thin,".Rdata")
model <- paste0('model_list$',modelname)
psrf.beta.name <- paste0("psrf.beta.",modelname,"_",sample,"_",thin)
psrf.gamma.name <- paste0("psrf.gamma.",modelname,"_",sample,"_",thin)
psrf.rho.name <- paste0("psrf.rho.",modelname,"_",sample,"_",thin)
jobname <- paste0("hmsc_",modelname,"_",sample,"_",thin)
minutes <- round(sample * thin * (ncol(YData)/500), 0)
code <- sprintf("#!/bin/bash
#SBATCH --job-name=%s # Job name
#SBATCH --nodes=1
#SBATCH --ntasks=4 # Run on 4 CPUs
#SBATCH --mail-user=garazi.bideguren@sund.ku.dk
#SBATCH --mem=800gb # Job memory request
#SBATCH --time=%d # In minutes
# Activate conda environment
module load mamba/1.3.1
source activate /maps/projects/mjolnir1/people/dlz554/hmsc_env
# Run R script
Rscript -e '
library(tidyverse)
library(Hmsc)
# Load formulas and data
load(\"hmsc/hmsc.Rdata\")
# Declare placeholders
modelname = \"%s\"
model = %s
fitname = \"%s\"
convname = \"%s\"
sample = %d
thin = %d
nchains = %d
# Run model fitting
m = sampleMcmc(hM = model,
samples = sample,
thin = thin,
adaptNf=rep(ceiling(0.4*sample*thin),model$nr),
transient = ceiling(0.5*sample*thin),
nChains = nchains,
nParallel = nchains)
# Assess chain convergence
mpost = convertToCodaObject(m,
spNamesNumbers = c(T,F),
covNamesNumbers = c(T,F),
Beta = TRUE,
Gamma = TRUE,
V = FALSE,
Sigma = FALSE,
Rho = TRUE,
Eta = FALSE,
Lambda = FALSE,
Alpha = FALSE,
Omega = FALSE,
Psi = FALSE,
Delta = FALSE) # Convert to CODA object
# Fixed effects
assign(paste0(\"psrf.beta.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Beta,multivariate=FALSE)$psrf)
# Traits
assign(paste0(\"psrf.gamma.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Gamma,multivariate=FALSE)$psrf)
# Phylogeny
assign(paste0(\"psrf.rho.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Rho,multivariate=FALSE)$psrf)
# Write convergence data
save(%s, %s, %s, file=convname)
# Save model fit object
save(m, file=fitname)
'
", jobname, minutes, modelname, model, fitname, convname, sample, thin, nChains, psrf.beta.name, psrf.gamma.name, psrf.rho.name)
writeLines(code, executablename)
}Upload the produced hmsc/exe_XXXXX.sh files to the HPC respecting the directory structure.
8.8 Assess chain convergence
Convergence diagnostic values substantially above 1 indicate lack of convergence. Values below 1.1 are considered good enough
# Load all conv file available in the hmsc folder
file_paths <-list.files(path = "hmsc_bookdown", pattern = "^conv_", full.names = TRUE, include.dirs = TRUE)
for (file_path in file_paths) {
load(file_path, verbose = TRUE) # Remove .GlobalEnv argument and specify verbose for each load operation
}Loading objects:
psrf.beta.model1_250_1
psrf.gamma.model1_250_1
psrf.rho.model1_250_1
Loading objects:
psrf.beta.model1_250_10
psrf.gamma.model1_250_10
psrf.rho.model1_250_10
# Create a merged psrf.beta (genome) plot
ls() %>%
grep("^psrf\\.beta", ., value = TRUE) %>%
map_dfr(~ {
mat <- get(.x)
data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
rownames_to_column(var="parameter") %>%
mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.beta.","",.)) %>%
mutate(sample = str_split(modelchain, "_")[[1]][2]) %>% #extract sample info from model name
mutate(thin = str_split(modelchain, "_")[[1]][3]) #extract thin info from model name
}) %>%
ggplot(.,aes(x=reorder(modelchain,-Point.est.,fun=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
geom_violin(fill="#b8d9e3", color="#328da8") +
geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
geom_hline(yintercept=1.1, linetype="dashed", color = "red") +
ylim(0.9,2)+
labs(x="Model chains",y="Parameter estimates")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))9 HMSC analysis
9.2 Compute variance partitioning
# Select modelchain of interest
load("hmsc_bookdown/fit_model1_250_10.Rdata")
varpart=computeVariancePartitioning(m)
# Compute variance partitioning
varpart=computeVariancePartitioning(m)
varpart$vals %>%
as.data.frame() %>%
rownames_to_column(var="variable") %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(variable=factor(variable, levels=c("Elevation","Tree_cover", "Anthropization_cover", "logseqdepth","Random: Sampling_point", "Random: Transect"))) %>%
group_by(variable) %>%
summarise(mean=mean(value)*100,sd=sd(value)*100) %>%
tt()| variable | mean | sd |
|---|---|---|
| Elevation | 11.38945 | 10.849982 |
| Tree_cover | 6.97232 | 4.187276 |
| Anthropization_cover | 11.16491 | 5.066755 |
| logseqdepth | 25.34223 | 13.997659 |
| Random: Sampling_point | 27.93211 | 17.075779 |
| Random: Transect | 17.19899 | 12.998778 |
# Basal tree
varpart_tree <- genome_tree %>%
keep.tip(., tip=m$spNames)
#Varpart table
varpart_table <- varpart$vals %>%
as.data.frame() %>%
rownames_to_column(var="variable") %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(variable=factor(variable, levels=rev(c("Elevation","Tree_cover","Anthropization_cover", "logseqdepth","Random: Sampling_point", "Random: Transect")))) %>%
mutate(genome=factor(genome, levels=rev(varpart_tree$tip.label)))
#Phylums
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
filter(genome %in% varpart_tree$tip.label) %>%
arrange(match(genome, varpart_tree$tip.label)) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome") %>%
select(phylum)
colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
filter(genome %in% varpart_tree$tip.label) %>%
arrange(match(genome, varpart_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
select(colors) %>%
pull()
# Basal ggtree
varpart_tree <- varpart_tree %>%
force.ultrametric(.,method="extend") %>%
ggtree(., size = 0.3)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum colors next to the tree tips
varpart_tree <- gheatmap(varpart_tree, phylum_colors, offset=-0.2, width=0.1, colnames=FALSE) +
scale_fill_manual(values=colors_alphabetic)+
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
varpart_tree <- varpart_tree + new_scale_fill()
# Add variance stacked barplot
vertical_tree <- varpart_tree +
scale_fill_manual(values=c("#34738f","#cccccc","#ed8a45","#b2b530","#be3e2b","#83bb90","#f6de6c", "#122f3d"))+
geom_fruit(
data=varpart_table,
geom=geom_bar,
mapping = aes(x=value, y=genome, fill=variable, group=variable),
pwidth = 2,
offset = 0.05,
width= 1,
orientation="y",
stat="identity",
color = "white",
size=0.1)+
labs(fill="Variable")
vertical_tree# Select desired support threshold
support=0.9
negsupport=1-support
# Basal tree
postestimates_tree <- genome_tree %>%
keep.tip(., tip=m$spNames)
#plotBeta(hM=m, post=getPostEstimate(hM=m, parName="Beta"), param = "Support", plotTree = TRUE, covNamesNumbers=c(1,0))
# Posterior estimate table
post_beta <- getPostEstimate(hM=m, parName="Beta")$support %>%
as.data.frame() %>%
mutate(variable=m$covNames) %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(genome=factor(genome, levels=rev(postestimates_tree$tip.label))) %>%
mutate(value = case_when(
value >= support ~ "Positive",
value <= negsupport ~ "Negative",
TRUE ~ "Neutral")) %>%
mutate(value=factor(value, levels=c("Positive","Neutral","Negative"))) %>%
pivot_wider(names_from = variable, values_from = value) %>%
rename(intercept=2,
Elevation=3,
Tree_cover=4,
Anthropization_cover=5,
logseqdepth=6
) %>%
select(genome,intercept,Elevation,Tree_cover,Anthropization_cover,logseqdepth) %>%
column_to_rownames(var="genome")
#Phylums
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
filter(genome %in% postestimates_tree$tip.label) %>%
arrange(match(genome, postestimates_tree$tip.label)) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome") %>%
select(phylum)
colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
filter(genome %in% postestimates_tree$tip.label) %>%
arrange(match(genome, postestimates_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
select(colors) %>%
pull()
# Basal ggtree
postestimates_tree <- postestimates_tree %>%
force.ultrametric(.,method="extend") %>%
ggtree(., size = 0.3)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#Add phylum colors next to the tree tips
postestimates_tree <- gheatmap(postestimates_tree, phylum_colors, offset=-0.2, width=0.1, colnames=FALSE) +
scale_fill_manual(values=colors_alphabetic)+
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
postestimates_tree <- postestimates_tree + new_scale_fill()
# Add posterior significant heatmap
postestimates_tree <- gheatmap(postestimates_tree, post_beta, offset=0, width=0.5, colnames=TRUE, colnames_position="top",colnames_angle=90, colnames_offset_y=1, hjust=0) +
scale_fill_manual(values=c("#be3e2b","#f4f4f4","#b2b530"))+
labs(fill="Trend")
postestimates_tree +
vexpand(.25, 1) # expand top#Compute the residual correlation matrix
OmegaCor = computeAssociations(m)
# Reference tree (for sorting genomes)
genome_tree_subset <- genome_tree %>%
keep.tip(., tip=m$spNames)
#Co-occurrence matrix at the animal level
supportLevel = 0.95
toPlot = ((OmegaCor[[1]]$support>supportLevel)
+ (OmegaCor[[1]]$support<(1-supportLevel))>0)*OmegaCor[[1]]$mean
matrix <- toPlot %>%
as.data.frame() %>%
rownames_to_column(var="genome1") %>%
pivot_longer(!genome1, names_to = "genome2", values_to = "cor") %>%
mutate(genome1= factor(genome1, levels=genome_tree_subset$tip.label)) %>%
mutate(genome2= factor(genome2, levels=genome_tree_subset$tip.label)) %>%
ggplot(aes(x = genome1, y = genome2, fill = cor)) +
geom_tile() +
scale_fill_gradient2(low = "#be3e2b",
mid = "#f4f4f4",
high = "#b2b530")+
theme_void()
vtree <- genome_tree_subset %>%
force.ultrametric(.,method="extend") %>%
ggtree(.)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#create composite figure
grid.arrange(grobs = list(vtree,matrix,vtree),
layout_matrix = rbind(c(4,1,1,1,1,1,1,1,1,1,1,1),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2),
c(3,2,2,2,2,2,2,2,2,2,2,2)))9.3 Elevation predictions
gradient = seq(940, 2350, by = 100)
gradientlength = length(gradient)
#Treatment-specific gradient predictions
pred_elevation <- constructGradient(m,
focalVariable = "Elevation",
non.focalVariables = list(logseqdepth=list(1),Tree_cover=list(1), Anthropization_cover=list(1)),
ngrid=gradientlength) %>%
predict(m, Gradient = ., expected = TRUE) %>%
do.call(rbind,.) %>%
as.data.frame() %>%
mutate(elevation=rep(gradient,1000)) %>%
pivot_longer(-c(elevation), names_to = "genome", values_to = "value")9.3.1 Responses to elevation
# Select desired support threshold
support=0.9
negsupport=1-support
#Get phylum colors from the EHI standard
phylum_colors <- genome_metadata %>%
left_join(read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv"), by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
#slice(2:5) %>%
select(colors) %>%
pull()
getPostEstimate(hM=m, parName="Beta")$support %>%
as.data.frame() %>%
mutate(variable=m$covNames) %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(trend = case_when(
value >= support ~ "Positive",
value <= negsupport ~ "Negative",
TRUE ~ "Neutral")) %>%
filter(variable=="Elevation") %>%
select(genome,trend) %>%
left_join(pred_elevation, by=join_by(genome==genome)) %>%
group_by(genome, trend, elevation) %>%
summarize(value = mean(value, na.rm = TRUE)) %>%
left_join(genome_metadata, by=join_by(genome == genome)) %>%
ggplot(aes(x=elevation, y=value, group=genome, color=phylum, linetype=trend)) +
geom_line() +
scale_linetype_manual(values=c("solid","dashed","solid")) +
scale_color_manual(values=phylum_colors) +
facet_grid(fct_rev(trend) ~ phylum) +
labs(y="Genome abundance (log)",x="Elevation") +
theme(legend.position = "none") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 0.8,),
axis.line.x = element_line(size = 0.5, linetype = "solid", colour = "black"),
)9.4 Functional predictions
9.4.1 Element level
elements_table <- genome_gifts_filt %>%
to.elements(., GIFT_db) %>%
as.data.frame()
community_elements <- pred_elevation %>%
group_by(elevation, genome) %>%
mutate(row_id = row_number()) %>%
pivot_wider(names_from = genome, values_from = value) %>%
ungroup() %>%
group_split(row_id) %>%
as.list() %>%
lapply(., FUN = function(x){x %>%
select(-row_id) %>%
column_to_rownames(var = "elevation") %>%
as.data.frame() %>%
exp() %>%
t() %>%
tss() %>%
to.community(elements_table,.,GIFT_db) %>%
as.data.frame() %>%
rownames_to_column(var="elevation")
})
calculate_slope <- function(x) {
lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
coef(lm_fit)[2]
}
element_predictions <- map_dfc(community_elements, function(mat) {
mat %>%
column_to_rownames(var = "elevation") %>%
t() %>%
as.data.frame() %>%
rowwise() %>%
mutate(slope = calculate_slope(c_across(everything()))) %>%
select(slope) }) %>%
t() %>%
as.data.frame() %>%
set_names(colnames(community_elements[[1]])[-1]) %>%
rownames_to_column(var="iteration") %>%
pivot_longer(!iteration, names_to="trait",values_to="value") %>%
group_by(trait) %>%
summarise(mean=mean(value),
p1 = quantile(value, probs = 0.1),
p9 = quantile(value, probs = 0.9),
positive_support = sum(value > 0)/1000,
negative_support = sum(value < 0)/1000) %>%
arrange(-positive_support)# Positively associated
p0<-positive_filtered<-element_predictions %>%
filter(mean >0) %>%
arrange(-positive_support) %>%
filter(positive_support>=0.9)
if (nrow(positive_filtered) > 0) {
positive_filtered %>%
tt() |>
style_tt(
i = which(element_predictions$positive_support < 0.9 & element_predictions$negative_support < 0.1),
background = "#E5D5B1") |>
style_tt(
i = which(element_predictions$negative_support > 0.9 & element_predictions$positive_support < 0.1),
background = "#B7BCCE")
}
#Negatively associated
p1<-negative_filtered<-element_predictions %>%
filter(mean <0) %>%
arrange(-negative_support) %>%
filter(negative_support>=0.9)
if (nrow(negative_filtered) > 0) {
negative_filtered %>%
tt() |>
style_tt(
i = which(element_predictions$positive_support < 0.9 & element_predictions$negative_support < 0.1),
background = "#E5D5B1") |>
style_tt(
i = which(element_predictions$negative_support > 0.9 & element_predictions$positive_support < 0.1),
background = "#B7BCCE")
}| trait | mean | p1 | p9 | positive_support | negative_support |
|---|---|---|---|---|---|
| D0613 | 0.034264683 | 0.0113319085 | 0.060146258 | 0.971 | 0.029 |
| B0106 | 0.017715307 | 0.0039875628 | 0.033388901 | 0.966 | 0.034 |
| D0609 | 0.014654471 | 0.0028010131 | 0.027577745 | 0.962 | 0.038 |
| D0205 | 0.008600377 | 0.0020467751 | 0.015207831 | 0.953 | 0.047 |
| D0207 | 0.031552241 | 0.0069001701 | 0.067737031 | 0.949 | 0.051 |
| B0802 | 0.037714918 | 0.0086104696 | 0.071624679 | 0.947 | 0.053 |
| D0304 | 0.014938085 | 0.0035430830 | 0.027059303 | 0.945 | 0.055 |
| B0214 | 0.019893750 | 0.0026561423 | 0.037824383 | 0.931 | 0.069 |
| B0217 | 0.013929202 | 0.0015844388 | 0.028027942 | 0.925 | 0.075 |
| D0817 | 0.004336948 | 0.0003403666 | 0.008888889 | 0.919 | 0.081 |
| trait | mean | p1 | p9 | positive_support | negative_support |
|---|---|---|---|---|---|
| B0208 | -0.0344477317 | -0.0618098460 | -1.039294e-02 | 0.016 | 0.984 |
| S0104 | -0.0209740208 | -0.0337504690 | -8.006082e-03 | 0.022 | 0.978 |
| D0805 | -0.0004581602 | -0.0009344457 | -6.018417e-05 | 0.035 | 0.965 |
| B0710 | -0.0061948836 | -0.0108430396 | -1.870104e-03 | 0.037 | 0.963 |
| B0310 | -0.0028164249 | -0.0048520436 | -8.904523e-04 | 0.043 | 0.957 |
| B1029 | -0.0003677198 | -0.0007876956 | -4.280235e-05 | 0.045 | 0.955 |
| D0604 | -0.0252076749 | -0.0512963980 | -5.101516e-03 | 0.050 | 0.950 |
| D0903 | -0.0047866601 | -0.0089646580 | -1.108197e-03 | 0.050 | 0.950 |
| B0218 | -0.0116625901 | -0.0223161257 | -2.061981e-03 | 0.053 | 0.947 |
| B0703 | -0.0192156802 | -0.0373477215 | -2.861509e-03 | 0.059 | 0.941 |
| D0206 | -0.0197688096 | -0.0392643553 | -3.454538e-03 | 0.061 | 0.939 |
| D0701 | -0.0051397234 | -0.0101283806 | -6.650890e-04 | 0.068 | 0.932 |
| D0509 | -0.0151983776 | -0.0301644940 | -2.157025e-03 | 0.070 | 0.930 |
| D0103 | -0.0058062776 | -0.0111387262 | -5.611137e-04 | 0.073 | 0.927 |
| B0903 | -0.0040583726 | -0.0082299820 | -3.182127e-04 | 0.078 | 0.922 |
| B0711 | -0.0116387135 | -0.0239457783 | -1.087371e-03 | 0.079 | 0.921 |
| D0307 | -0.0106690266 | -0.0219998777 | -7.160822e-04 | 0.083 | 0.917 |
| B0702 | -0.0164229499 | -0.0340972191 | -1.406339e-03 | 0.085 | 0.915 |
| D0508 | -0.0025923862 | -0.0054092788 | -1.884571e-04 | 0.089 | 0.911 |
| D0908 | -0.0351466114 | -0.0789647021 | -1.049356e-03 | 0.091 | 0.909 |
| D0912 | -0.0009892400 | -0.0020007917 | -1.425600e-05 | 0.092 | 0.908 |
| B0303 | -0.0072114512 | -0.0152002291 | -7.612867e-05 | 0.097 | 0.903 |
#Positively associated
positive <- element_predictions %>%
filter(mean >0) %>%
arrange(mean) %>%
filter(positive_support>=0.9) %>%
select(-negative_support) %>%
rename(support=positive_support)
#Negatively associated
negative <- element_predictions %>%
filter(mean <0) %>%
arrange(mean) %>%
filter(negative_support>=0.9) %>%
select(-positive_support) %>%
rename(support=negative_support)
all_elements <- bind_rows(positive,negative) %>%
left_join(GIFT_db,by=join_by(trait==Code_element)) %>%
mutate(trait=factor(trait,levels=c(rev(positive$trait),rev(negative$trait)))) %>%
mutate(Code_function=factor(Code_function)) %>%
mutate(element_legend=str_c(trait," - ",Element)) %>%
mutate(function_legend=str_c(Code_function," - ",Function)) %>%
select(trait,mean,p1,p9,element_legend,function_legend) %>%
unique()
gift_colors <- read_tsv("data/gift_colors.tsv") %>%
mutate(legend=str_c(Code_function," - ",Function)) %>%
filter(legend %in% all_elements$function_legend)
all_elements %>%
ggplot(aes(x=mean, y=fct_reorder(element_legend, mean), xmin=p1, xmax=p9, color=function_legend)) +
geom_point() +
geom_errorbar() +
xlim(c(-0.1,0.09)) +
geom_vline(xintercept=0) +
scale_color_manual(values = gift_colors$Color) +
theme_minimal() +
labs(x="Regression coefficient",y="Functional trait")community_elements %>%
bind_rows() %>%
pivot_longer(-elevation, names_to = "trait", values_to = "value") %>%
filter(trait %in% positive$trait) %>%
mutate(trait=factor(trait, levels=positive$trait)) %>%
mutate(elevation=as.numeric(elevation)) %>%
ggplot(aes(x=elevation, y=value)) +
geom_smooth(method = lm, formula = y ~ x, se = TRUE) +
#geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE) +
facet_wrap(~trait, ncol=5, scales="free") +
theme_minimal() +
labs(x="Elevation",y="Metabolic Capacity Index")9.4.1.1 GIFT test visualization
# Aggregate bundle-level GIFTs into the compound level
genome_counts_filt_filt <- tibble::rownames_to_column(genome_counts_filt_filt, var = "genome")
GIFTs_elements_filtered <- elements_table[rownames(elements_table) %in% genome_counts_filt_filt$genome, ]
GIFTs_elements_filtered <- as.data.frame(GIFTs_elements_filtered) %>%
select_if(~ !is.numeric(.) || sum(.) != 0)
# Get community-weighed average GIFTs per sample
GIFTs_elements_community <- to.community(GIFTs_elements_filtered, genome_counts_filt_filt %>% column_to_rownames(., "genome") %>% tss(), GIFT_db)
GIFTs_elements_community %>%
as.data.frame() %>%
rownames_to_column(var="sample") %>%
pivot_longer(!sample,names_to="trait",values_to="gift") %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=sample,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ Elevation, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.text.y = element_text(angle = 0)) +
labs(y="Traits",x="Samples",fill="GIFT")9.4.2 Functional level
functions_table <- elements_table %>%
to.functions(., GIFT_db) %>%
as.data.frame()
community_functions <- pred_elevation %>%
group_by(elevation, genome) %>%
mutate(row_id = row_number()) %>%
pivot_wider(names_from = genome, values_from = value) %>%
ungroup() %>%
group_split(row_id) %>%
as.list() %>%
lapply(., FUN = function(x){x %>%
select(-row_id) %>%
column_to_rownames(var = "elevation") %>%
as.data.frame() %>%
exp() %>%
t() %>%
tss() %>%
to.community(functions_table,.,GIFT_db) %>%
as.data.frame() %>%
rownames_to_column(var="elevation")
})#max-min option
calculate_slope <- function(x) {
lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
coef(lm_fit)[2]
}
function_predictions <- map_dfc(community_functions, function(mat) {
mat %>%
column_to_rownames(var = "elevation") %>%
t() %>%
as.data.frame() %>%
rowwise() %>%
mutate(slope = calculate_slope(c_across(everything()))) %>%
select(slope) }) %>%
t() %>%
as.data.frame() %>%
set_names(colnames(community_functions[[1]])[-1]) %>%
rownames_to_column(var="iteration") %>%
pivot_longer(!iteration, names_to="trait",values_to="value") %>%
group_by(trait) %>%
summarise(mean=mean(value),
p1 = quantile(value, probs = 0.1),
p9 = quantile(value, probs = 0.9),
positive_support = sum(value > 0)/1000,
negative_support = sum(value < 0)/1000) %>%
arrange(-positive_support)# Positively associated
p2<-function_predictions %>%
filter(mean >0) %>%
arrange(-positive_support) %>%
filter(positive_support>=0.9) %>%
tt() |>
style_tt(
i = which(function_predictions$positive_support < 0.9 & function_predictions$negative_support < 0.1),
background = "#E5D5B1") |>
style_tt(
i = which(function_predictions$negative_support > 0.9 & function_predictions$positive_support < 0.1),
background = "#B7BCCE")
# Negatively associated (there isn't)
#function_predictions %>%
#filter(mean <0) %>%
#arrange(-negative_support) %>%
#filter(negative_support>=0.9) %>%
#tt() |>
#style_tt(
#i = which(function_predictions$positive_support < 0.9 & function_predictions$negative_support < 0.1),
#background = "#E5D5B1") |>
#style_tt(
#i = which(function_predictions$negative_support > 0.9 & function_predictions$positive_support < 0.1),
#background = "#B7BCCE")| trait | mean | p1 | p9 | positive_support | negative_support |
|---|---|---|---|---|---|
| S03 | 0.026906176 | 0.0104593848 | 0.044524683 | 0.973 | 0.027 |
| B04 | 0.018264988 | 0.0051591436 | 0.033137776 | 0.970 | 0.030 |
| D03 | 0.010497822 | 0.0021459296 | 0.019986199 | 0.943 | 0.057 |
| D06 | 0.003285998 | 0.0004532722 | 0.006338923 | 0.933 | 0.067 |
| B07 | 0.012228797 | 0.0004053295 | 0.024691062 | 0.906 | 0.094 |
all_functions <- function_predictions %>%
left_join(GIFT_db,by=join_by(trait==Code_function)) %>%
mutate(trait=factor(trait)) %>%
mutate(function_legend=str_c(trait," - ",Function)) %>%
select(trait,mean,p1,p9,function_legend) %>%
unique()
gift_colors <- read_tsv("data/gift_colors.tsv") %>%
mutate(legend=str_c(Code_function," - ",Function)) %>%
filter(legend %in% all_functions$function_legend)
all_functions %>%
ggplot(aes(x=mean, y=fct_reorder(function_legend, mean), xmin=p1, xmax=p9, color=function_legend)) +
geom_point() +
geom_errorbar() +
xlim(c(-0.05,0.05)) +
geom_vline(xintercept=0) +
scale_color_manual(values = gift_colors$Color) +
theme_minimal() +
labs(x="Regression coefficient",y="Functional trait") +
guides(col = guide_legend(ncol = 1))community_functions %>%
bind_rows() %>%
pivot_longer(-elevation, names_to = "trait", values_to = "value") %>%
filter(trait %in% function_predictions$trait) %>%
mutate(trait=factor(trait, levels=function_predictions$trait)) %>%
mutate(elevation=as.numeric(elevation)) %>%
ggplot(aes(x=elevation, y=value)) +
geom_smooth(method = lm, formula = y ~ x, se = TRUE) +
#geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE) +
facet_wrap(~trait, ncol=5, scales="free") +
theme_minimal() +
labs(x="Elevation",y="Metabolic Capacity Index")9.4.2.1 GIFT test visualization
# Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements_filtered, GIFT_db)
functions <- GIFTs_functions %>%
as.data.frame()
# Get community-weighed average GIFTs per sample
GIFTs_functions_community <- to.community(GIFTs_functions, genome_counts_filt_filt %>% column_to_rownames(., "genome") %>% tss(), GIFT_db)
GIFTs_functions_community %>%
as.data.frame() %>%
rownames_to_column(var="sample") %>%
pivot_longer(!sample,names_to="trait",values_to="gift") %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
ggplot(aes(x=trait,y=sample,fill=gift)) +
geom_tile(colour="white", size=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(Elevation ~ ., scales="free",space="free")9.4.3 Domain level
domains_table <- functions_table %>%
to.domains(., GIFT_db) %>%
as.data.frame()
community_domains <- pred_elevation %>%
group_by(elevation, genome) %>%
mutate(row_id = row_number()) %>%
pivot_wider(names_from = genome, values_from = value) %>%
ungroup() %>%
group_split(row_id) %>%
as.list() %>%
lapply(., FUN = function(x){x %>%
select(-row_id) %>%
column_to_rownames(var = "elevation") %>%
as.data.frame() %>%
exp() %>%
t() %>%
tss() %>%
to.community(domains_table,.,GIFT_db) %>%
as.data.frame() %>%
rownames_to_column(var="elevation")
})#max-min option
calculate_slope <- function(x) {
lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
coef(lm_fit)[2]
}
domain_predictions <- map_dfc(community_domains, function(mat) {
mat %>%
column_to_rownames(var = "elevation") %>%
t() %>%
as.data.frame() %>%
rowwise() %>%
mutate(slope = calculate_slope(c_across(everything()))) %>%
select(slope) }) %>%
t() %>%
as.data.frame() %>%
set_names(colnames(community_domains[[1]])[-1]) %>%
rownames_to_column(var="iteration") %>%
pivot_longer(!iteration, names_to="trait",values_to="value") %>%
group_by(trait) %>%
summarise(mean=mean(value),
p1 = quantile(value, probs = 0.1),
p9 = quantile(value, probs = 0.9),
positive_support = sum(value > 0)/1000,
negative_support = sum(value < 0)/1000) %>%
arrange(-positive_support)# Positively associated
p3<-positive_filtered<-domain_predictions %>%
filter(mean >0) %>%
arrange(-positive_support) %>%
filter(positive_support>=0.9)
if (nrow(positive_filtered) > 0) {
positive_filtered %>%
tt() |>
style_tt(
i = which(domain_predictions$positive_support < 0.9 & domain_predictions$negative_support < 0.1),
background = "#E5D5B1") |>
style_tt(
i = which(domain_predictions$negative_support > 0.9 & domain_predictions$positive_support < 0.1),
background = "#B7BCCE")
}
# Negatively associated (there isn't)
#domain_predictions %>%
#filter(mean <0) %>%
#arrange(-negative_support) %>%
#filter(negative_support>=0.9) %>%
#tt() |>
#style_tt(
#i = which(domain_predictions$positive_support < 0.9 & domain_predictions$negative_support < 0.1),
#background = "#E5D5B1") |>
#style_tt(
#i = which(domain_predictions$negative_support > 0.9 & domain_predictions$positive_support < 0.1),
#background = "#B7BCCE")| trait | mean | p1 | p9 | positive_support | negative_support |
|---|---|---|---|---|---|
| Overall | 0.006863701 | 0.0003939157 | 0.01509651 | 0.917 | 0.083 |
all_domains <- domain_predictions %>%
left_join(GIFT_db,by=join_by(trait==Code_function)) %>%
mutate(trait=factor(trait)) %>%
mutate(function_legend=str_c(trait," - ",Function)) %>%
select(trait,mean,p1,p9) %>%
unique()
all_domains %>%
ggplot(aes(x=mean, y=fct_reorder(trait, mean), xmin=p1, xmax=p9, color=trait)) +
geom_point() +
geom_errorbar() +
xlim(c(-0.02,0.02)) +
geom_vline(xintercept=0) +
theme_minimal() +
labs(x="Regression coefficient",y="Domain level") +
guides(col = guide_legend(ncol = 1))community_domains %>%
bind_rows() %>%
pivot_longer(-elevation, names_to = "trait", values_to = "value") %>%
filter(trait %in% domain_predictions$trait) %>%
mutate(trait=factor(trait, levels=domain_predictions$trait)) %>%
mutate(elevation=as.numeric(elevation)) %>%
ggplot(aes(x=elevation, y=value)) +
geom_smooth(method = lm, formula = y ~ x, se = TRUE) +
#geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE) +
facet_wrap(~trait, ncol=5, scales="free") +
theme_minimal() +
labs(x="Elevation",y="Metabolic Capacity Index")9.4.3.1 GIFT test visualization
# Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions, GIFT_db)
domains <- GIFTs_domains %>%
as.data.frame()
# Get community-weighed average GIFTs per sample
GIFTs_domains_community <- to.community(GIFTs_domains, genome_counts_filt_filt %>% column_to_rownames(., "genome") %>% tss(), GIFT_db)
GIFTs_domains_community %>%
as.data.frame() %>%
rownames_to_column(var="sample") %>%
pivot_longer(!sample,names_to="trait",values_to="gift") %>%
left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
ggplot(aes(x=trait,y=sample,fill=gift)) +
geom_tile(colour="white", size=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(Elevation ~ ., scales="free",space="free")University of Copenhagen, garazi.bideguren@sund.ku.dk↩︎
University of Copenhagen, ostaizka.aizpurua@sund.ku.dk↩︎
University of Valencia, jal4@uv.es↩︎
Centre National de la Recherche Scientifique, faubret@gmail.com↩︎
University of Valencia and Universidade do Porto, guillem.perez-lanuza@uv.es↩︎
, ↩︎
University of Lund, tobias.uller@biol.lu.se↩︎
University of Copenhagen, aoife.leonard@sund.ku.dk↩︎
University of Copenhagen, antton.alberdi@sund.ku.dk↩︎